# 24-nov
# - t-SNE followed by spectral clustering
# Train and adjust parameters
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import warnings
warnings.simplefilter('ignore',DeprecationWarning)
import seaborn as sns
import time
import copy
from pylab import rcParams
#import hdbscan
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import metrics
from sklearn import metrics as mt
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix as conf
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.cluster import KMeans
from tabulate import tabulate
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from __future__ import print_function
# Load the cleaned feature table used for clustering.
data_dir = '../data/'
data_file = 'mashable_clean_dataset_for_lab_03.csv'
file_2_read = data_dir + data_file
df = pd.read_csv(file_2_read)

# Independent working copy so `df` keeps the data exactly as loaded.
df_cluster = df.copy()
#del df_cluster['data_channel']

# ... read in original data set to retrieve 'shares' values
data_file = 'OnlineNewsPopularity.csv'
file_2_read = data_dir + data_file
df_ONP = pd.read_csv(file_2_read)
# Strip surrounding whitespace from the column names before selecting by name.
df_ONP.columns = df_ONP.columns.str.strip()

# Take an explicit copy: new columns are added below, and assigning into a
# bare column selection can raise pandas' SettingWithCopyWarning.
df_ONP = df_ONP[['shares']].copy()
# Log-transformed share count; the +1 keeps a zero count finite.
df_ONP['ln_shares'] = np.log(df_ONP['shares'] + 1)
# Boolean target: True when the article has more than 1400 shares.
df_ONP['popular'] = df_ONP['shares'] > 1400

col_names = df_cluster.columns.values.tolist()
col_names
# set required variables for model comparison
comparison_tbl = pd.DataFrame(columns=[
    'model_name',
    'n_clusters',
    'inertia',
    'silhouette',
    'process_time',
])

# Integer counter used when labelling result rows (starts at 0).
i_index = 0

# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
from sklearn.manifold import TSNE
# Attach the two target columns to the feature table.  X1 is bound to the
# same object as df_cluster at this point, so df_cluster gains both columns
# too (kept as-is in case later cells rely on it).
X1 = df_cluster
X1['ln_shares'] = df_ONP['ln_shares']
X1['popular'] = df_ONP['popular']

# Continue with a random 30% sample of the rows (unseeded, so the sample
# differs from run to run).
X1 = X1.sample(frac=0.30)

# Set the targets aside, then remove them so they do not feed the embedding.
X1_ln_shares = X1['ln_shares']
X1_popular = X1['popular']
columns_to_drop = ['ln_shares', 'popular']
# Rebind instead of dropping in place: the sampled frame is a derived object
# and in-place mutation of it can trigger SettingWithCopyWarning.
X1 = X1.drop(columns_to_drop, axis=1)

# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
tic = time.perf_counter()
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to
# `max_iter` -- confirm against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=5, n_iter=300)
tsne_results = tsne.fit_transform(X1)
toc = time.perf_counter()

# Elapsed seconds for the t-SNE fit.
print(toc - tic)
from ggplot import *
from ggplot import scale_fill_brewer
from matplotlib import cm
# Features and the two t-SNE coordinates side by side in one frame.
df_tsne = X1.copy()
df_tsne['x-tsne'] = tsne_results[:, 0]
df_tsne['y-tsne'] = tsne_results[:, 1]

col_names = df_tsne.columns.values.tolist()

# One scatter plot of the embedding per column, coloured by that column's
# values (the two coordinate columns themselves are included in the list).
for col in col_names:
    plt.figure(figsize=(12, 8))
    # `facecolor` replaces the `axisbg` keyword, which matplotlib 2.2 removed.
    plt.subplot(111, facecolor='darkgrey')
    plt.scatter(
        df_tsne['x-tsne'],
        df_tsne['y-tsne'],
        c=df_tsne[col],
        cmap=plt.cm.Spectral,
        s=50,
        linewidths=0,
        alpha=0.30,
    )
    plt.colorbar()
    plt.xlabel('t-SNE axis 1')
    plt.ylabel('t-SNE axis 2')
    plt.title(col)
    plt.grid(True)
    plt.show()

# Notebook-style inspection of the embedding and its row count.
tsne_results
len(tsne_results)
# set required variables for model comparison
tsne_tbl = pd.DataFrame(columns=[
    'model_name',
    'n_clusters',
    'inertia',
    'silhouette',
    'process_time',
])

# Integer counter used when labelling result rows (starts at 0).
i_index = 0

# preparation for cross validation and model comparison, each classifier is appended once model is fit
models = []
# ... spectral clustering on the t-SNE vectors
X_tsne = pd.DataFrame(
    {'t1': tsne_results[:, 0], 't2': tsne_results[:, 1]},
    columns=['t1', 't2'],
)

from sklearn.cluster import SpectralClustering

# affinity: if a string, this may be one of
# 'nearest_neighbors', 'precomputed', 'rbf'
# or one of the kernels supported by sklearn.metrics.pairwise_kernels

# Fit one model per cluster count from 2 to 9, record its silhouette score
# and run time in tsne_tbl, and plot the resulting cluster assignment.
for n_clstr in range(2, 10):
    # time.clock() was removed in Python 3.8; perf_counter() measures
    # elapsed time instead.
    tic = time.perf_counter()
    print("n_clusters = ", n_clstr)

    spc = SpectralClustering(n_clusters=n_clstr,
                             affinity='nearest_neighbors')
    spc_labels = spc.fit_predict(X_tsne)

    spc_silhouette = metrics.silhouette_score(X_tsne,
                                              spc_labels,
                                              metric='euclidean',
                                              sample_size=10000)
    print("silhouette = ", spc_silhouette)
    toc = time.perf_counter()

    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    # ... - save statistics for model comparison
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    exe_time = '{0:.4f}'.format(toc - tic)
    raw_data = {
        'model_name': 'spc - features',
        'n_clusters': n_clstr,
        # Placeholder of 0 keeps the column layout of the comparison table.
        'inertia': 0,
        'silhouette': spc_silhouette,
        'process_time': exe_time,
    }

    # Advance the counter so every row gets its own index label; previously
    # it was never incremented and all rows were labelled 1.
    i_index += 1
    df_tbl = pd.DataFrame(raw_data,
                          columns=['model_name', 'n_clusters', 'inertia',
                                   'silhouette', 'process_time'],
                          index=[i_index])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    tsne_tbl = pd.concat([tsne_tbl, df_tbl])

    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    # ... - make some plots of clusters
    # ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    plt.figure(figsize=(12, 8))
    # `facecolor` replaces the `axisbg` keyword, which matplotlib 2.2 removed.
    plt.subplot(111, facecolor='darkgrey')
    X_tsne_values = X_tsne.values
    plt.scatter(X_tsne_values[:, 0], X_tsne_values[:, 1],
                c=spc_labels,
                cmap=plt.cm.Paired,
                s=50,
                linewidths=0,
                alpha=0.20)
    plt.xlabel('t-SNE axis 1')
    plt.ylabel('t-SNE axis 2')
    # Build the title text explicitly: print() returns None, and the figure
    # used to be titled with the literal word 'title'.
    title = 'n_clusters = {}'.format(n_clstr)
    plt.title(title)
    plt.grid(True)
    plt.show()
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... - plot metrics across models for comparison
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
plt.figure(figsize=(16, 6))

# The table columns can hold strings or generic objects ('process_time' is
# stored as a formatted string), which matplotlib would treat as categories.
# Convert to numbers so every axis is scaled numerically.
n_clusters_axis = pd.to_numeric(tsne_tbl['n_clusters'])

# (column to plot, whether to join the markers with a line), left to right.
# process_time is drawn as markers only.
metric_panels = [
    ('silhouette', True),
    ('inertia', True),
    ('process_time', False),
]

for panel_no, (metric_name, join_points) in enumerate(metric_panels, start=1):
    plt.subplot(1, 3, panel_no)
    metric_values = pd.to_numeric(tsne_tbl[metric_name])
    plt.scatter(n_clusters_axis,
                metric_values,
                s=40,
                linewidths=1.0,
                marker='^',
                edgecolors='black',
                alpha=0.90)
    if join_points:
        plt.plot(n_clusters_axis, metric_values)
    plt.xlabel('n_clusters')
    plt.ylabel(metric_name)
    plt.grid()

plt.show()
# Refit spectral clustering once with the selected number of clusters; the
# resulting spc_labels are the cluster assignments used from here on.
n_clusters_chosen = 8
n_clstr = n_clusters_chosen

# time.clock() was removed in Python 3.8; perf_counter() measures elapsed
# time instead.
tic = time.perf_counter()
print("n_clusters = ", n_clstr)

spc = SpectralClustering(n_clusters=n_clstr,
                         affinity='nearest_neighbors')
spc_labels = spc.fit_predict(X_tsne)
spc_labels

spc_silhouette = metrics.silhouette_score(X_tsne,
                                          spc_labels,
                                          metric='euclidean',
                                          sample_size=10000)
print("silhouette = ", spc_silhouette)
toc = time.perf_counter()
# Gather features, targets, cluster labels and t-SNE coordinates in one frame.
X_all_together = X1.copy()
len(X_all_together)
X_all_together['ln_shares'] = X1_ln_shares
X_all_together['popular'] = X1_popular
X_all_together['spc_labels'] = spc_labels
X_all_together['t1'] = tsne_results[:, 0]
X_all_together['t2'] = tsne_results[:, 1]

# boxplot across clusters for each feature ...
import seaborn as sns

col_names = X_all_together.columns.values.tolist()

# One three-panel figure per column: the embedding coloured by the column,
# the column's distribution per cluster, and the embedding coloured by cluster.
for col in col_names:
    plt.figure(figsize=(24, 8))

    # ... feature distribution color map
    # `facecolor` replaces the `axisbg` keyword, which matplotlib 2.2 removed.
    plt.subplot(131, facecolor='darkgrey')
    plt.scatter(X_all_together['t1'], X_all_together['t2'],
                c=X_all_together[col],
                cmap=plt.cm.Spectral,
                s=50,
                linewidths=0,
                alpha=0.30)
    plt.title(col)

    # ... feature boxplots
    plt.subplot(132, facecolor='darkgrey')
    sns.boxplot(x="spc_labels", y=col, data=X_all_together)

    # ... cluster color map
    plt.subplot(133, facecolor='darkgrey')
    plt.scatter(X_all_together['t1'], X_all_together['t2'],
                c=spc_labels,
                cmap=plt.cm.tab20,
                s=50,
                linewidths=0,
                alpha=0.30)
    plt.xlabel('t-SNE axis 1')
    plt.ylabel('t-SNE axis 2')
    plt.title('t-SNE 2-D mapping')

    plt.show()